# This script creates multiple datasets for training the PRS-only model, with different degrees of oversampling.
# Oversampling performed using ADASYN
# Python version 3.6.8 is used

# Imports
import os
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from collections import Counter
from imblearn.over_sampling import ADASYN

# Set working directory
# NOTE(review): this path resolves to the filesystem root - looks like an
# anonymised placeholder; confirm the real working directory before running.
os.chdir("/../../..")

# Load PRS data - data found in IOWBC_PRS_data.xlsx, sheet: "IOWBC PRS"
data = pd.read_csv("PRS_116snp_Asthma10YR_Adjusted.csv", index_col=False)
# Drop the family ID and regression-flag columns; only IID, PRS and outcome remain
data = data.drop(columns=['FID', 'In_Regression'])
# 924 IDs

# Drop IDs without an Asthma_10YR outcome
data1 = data.dropna()
Counter(data1.Asthma_10YR)  # inspect class balance
# 908 IDs - 767 controls, 141 cases

# Recode the Asthma_10YR outcome from the PRS analysis coding:
# 0 = controls (was 1), 1 = cases (was 2)
data1['Asthma_10YR'] = np.where(data1['Asthma_10YR'] == 2.0, 1, 0)

# Split dataset into a training and test set.
# The PRS test set contains individuals present in either the infancy or
# preschool test sets; these IDs come from the test sets used to develop the
# clinical ML models:
#   Preschool_standardised_test_dataset_183IDs.csv - IOWBC_training_test_data.xlsx, sheet: "Standardised preschool test set"
#   Earlylife_standardised_test_dataset_255IDs.csv - IOWBC_training_test_data.xlsx, sheet: "Standardised earlylife test set"

earlylife_test = pd.read_csv("Earlylife_standardised_test_dataset_255IDs.csv", index_col=False)
earlylife_test = earlylife_test.drop(columns=['Unnamed: 0'])  # drop stray index column; 255 IDs
preschool_test = pd.read_csv("Preschool_standardised_test_dataset_183IDs.csv", index_col=False)
preschool_test = preschool_test.drop(columns=['Unnamed: 0'])  # drop stray index column; 183 IDs

# Membership of each PRS individual in the clinical test sets
in_earlylife = data1.IID.isin(earlylife_test.Study_ID)
in_preschool = data1.IID.isin(preschool_test.Study_ID)

# IDs in either clinical test set form the PRS test set; everyone else trains
PRS_test = data1[in_earlylife | in_preschool]      # 267 IDs
PRS_train = data1[~in_earlylife & ~in_preschool]   # 641 IDs

# Save PRS training and test datasets.
# (FIX: the source notes below previously trailed the to_csv() calls as bare
# text, which made the whole script a SyntaxError; they are now comments.)
#   Test set:     IOWBC_PRS_data.xlsx, sheet: "PRS test set"
#   Training set: IOWBC_PRS_data.xlsx, sheet: "PRS training set"
PRS_test.to_csv("PRSonly_model_267ID_test_dataset.csv")
PRS_train.to_csv("PRSonly_model_641ID_training_dataset.csv")

# Standardise PRS training dataset: z-score the PRS column.
# The scaler is fitted on the training data only; the same fitted scaler is
# reused below for the test set, avoiding test-set leakage.
scaler = StandardScaler()
x = PRS_train.copy()
del x['IID']
del x['Asthma_10YR']
PRSsc_train = pd.DataFrame(scaler.fit_transform(x.iloc[:, :]))
PRSsc_train.columns = ['PRS_scaled']
# Re-attach IID and outcome to the scaled PRS (reset_index so rows align)
PRS_SXY_train = pd.concat([PRS_train.reset_index(drop=True), PRSsc_train], axis=1)
PRS_SXY_train = PRS_SXY_train[['IID', 'PRS_scaled', 'Asthma_10YR']]
# Data found in IOWBC_PRS_data.xlsx, sheet: "PRS standardised training set".
# (FIX: this note previously trailed the to_csv() call as bare text - a SyntaxError.)
PRS_SXY_train.to_csv("PRSonly_model_641ID_standardised_training_dataset.csv")
Counter(PRS_SXY_train.Asthma_10YR)
# Controls: 538, Cases: 103 ~ 16.1% asthmatic

# Standardise test set using the scaler fitted on the training data above
y = PRS_test.copy()
del y['IID']
del y['Asthma_10YR']
PRSsc_test = pd.DataFrame(scaler.transform(y.iloc[:, :]))
PRSsc_test.columns = ['PRS_scaled']
# Re-attach IID and outcome to the scaled PRS (reset_index so rows align)
PRS_SXY_test = pd.concat([PRS_test.reset_index(drop=True), PRSsc_test], axis=1)
PRS_SXY_test = PRS_SXY_test[['IID', 'PRS_scaled', 'Asthma_10YR']]
# Data found in IOWBC_PRS_data.xlsx, sheet: "PRS standardised test set".
# (FIX: this note previously trailed the to_csv() call as bare text - a SyntaxError.)
PRS_SXY_test.to_csv("PRSonly_model_267ID_standardised_test_dataset.csv")

Counter(PRS_SXY_test.Asthma_10YR)
# Controls: 229, Cases: 38 ~ 14.2% asthmatic

######################################
### Construct oversampled datasets ###
######################################
# Oversample +/- undersample the dataset.
# Oversampling is performed on the standardised training dataset.
# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling
SX_train = PRS_SXY_train.drop(['IID', 'Asthma_10YR'], axis=1)  # features only (scaled PRS)
y_train = PRS_SXY_train['Asthma_10YR']                         # outcome labels
# Counter({0.0: 569, 1.0: 106})
# NOTE(review): the counts above (569/106) disagree with the 538/103 reported
# earlier for the same training set - verify which run they came from.
# Save training-set IDs so resampled rows can be re-joined to Study_IDs
Train_IDs = PRS_SXY_train[['IID']]


def _adasyn_oversample_and_save(n_cases_target, out_path, n_controls=538):
    """Oversample asthma cases in the standardised training set with ADASYN
    and write the resampled dataset to CSV.

    Parameters
    ----------
    n_cases_target : int
        Requested number of cases after oversampling. The ADASYN
        sampling_strategy is n_cases_target / n_controls. ADASYN is
        heuristic, so the achieved case count can differ slightly from
        the request.
    out_path : str
        Output CSV filename (written without the index).
    n_controls : int
        Number of controls in the training set before oversampling
        (538 in this cohort).

    Returns
    -------
    pandas.DataFrame
        Columns ['Study_ID', 'PRS', 'Asthma_10YR']. Synthetic rows have no
        original ID, so their Study_ID is NaN - this mirrors the original
        script, which re-attached only the 641 real training IDs.
    """
    osx, oy = ADASYN(sampling_strategy=(n_cases_target / n_controls),
                     random_state=123).fit_resample(SX_train, y_train)
    # FIX: message previously said 'Original dataset shape' but prints the
    # class counts AFTER resampling.
    print('Resampled dataset shape %s' % Counter(oy))
    # Continuous variables rounded to 6dp
    osx_df = pd.DataFrame(data=osx).round(6)
    oy_df = pd.DataFrame(data=oy)
    oy_df.columns = ['Asthma_10YR']
    # Add IDs and outcome
    resampled = pd.concat([Train_IDs.reset_index(drop=True), osx_df], axis=1)
    resampled.columns = ['Study_ID', 'PRS']
    resampled = pd.concat([resampled, oy_df], axis=1)
    resampled.to_csv(out_path, index=False)
    return resampled


# Requested case counts per oversampling level (controls stay at 538).
# Achieved counts observed in the original runs:
#   +25%: request 145 -> 144      +50%: request 156 -> 178
#  +100%: request 208 -> 206     +150%: request 261 -> 281
#  +200%: request 313 -> 345     +250%: request 366 -> 379
#  +300%: request 412 -> 420

### Oversample cases by 25% ###
Oversampled_25 = _adasyn_oversample_and_save(
    145, "PRS_only_standardised_oversampled_training_dataset_25%.csv")

### Oversample cases by 50% ###
Oversampled_50 = _adasyn_oversample_and_save(
    156, "PRS_only_standardised_oversampled_training_dataset_50%.csv")

### Oversample cases by 100% ###
Oversampled_100 = _adasyn_oversample_and_save(
    208, "PRS_only_standardised_oversampled_training_dataset_100%.csv")

### Oversample cases by 150% ###
Oversampled_150 = _adasyn_oversample_and_save(
    261, "PRS_only_standardised_oversampled_training_dataset_150%.csv")

### Oversample cases by 200% ###
Oversampled_200 = _adasyn_oversample_and_save(
    313, "PRS_only_standardised_oversampled_training_dataset_200%.csv")

### Oversample cases by 250% ###
Oversampled_250 = _adasyn_oversample_and_save(
    366, "PRS_only_standardised_oversampled_training_dataset_250%.csv")

### Oversample cases by 300% ###
Oversampled_300 = _adasyn_oversample_and_save(
    412, "PRS_only_standardised_oversampled_training_dataset_300%.csv")